import pandas as pd_rp
import numpy as np_rp
import matplotlib.pyplot as plt_rp
import seaborn as sns_rp
import plotly.express as px_rp
import math
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import warnings
pd_rp.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")
df_rp = pd_rp.read_csv('/Users/narellavamshi/Downloads/train.csv',index_col='Id')
df_rp_t= pd_rp.read_csv('/Users/narellavamshi/Downloads/test.csv',index_col='Id')
df_rp.head()
| MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Id | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2003 | 2003 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 196.0 | Gd | TA | PConc | Gd | TA | No | GLQ | 706 | Unf | 0 | 150 | 856 | GasA | Ex | Y | SBrkr | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 8 | Typ | 0 | NaN | Attchd | 2003.0 | RFn | 2 | 548 | TA | TA | Y | 0 | 61 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | Norm | 1Fam | 1Story | 6 | 8 | 1976 | 1976 | Gable | CompShg | MetalSd | MetalSd | NaN | 0.0 | TA | TA | CBlock | Gd | TA | Gd | ALQ | 978 | Unf | 0 | 284 | 1262 | GasA | Ex | Y | SBrkr | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1976.0 | RFn | 2 | 460 | TA | TA | Y | 298 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2001 | 2002 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 162.0 | Gd | TA | PConc | Gd | TA | Mn | GLQ | 486 | Unf | 0 | 434 | 920 | GasA | Ex | Y | SBrkr | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 6 | Typ | 1 | TA | Attchd | 2001.0 | RFn | 2 | 608 | TA | TA | Y | 0 | 42 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
| 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | Norm | 1Fam | 2Story | 7 | 5 | 1915 | 1970 | Gable | CompShg | Wd Sdng | Wd Shng | NaN | 0.0 | TA | TA | BrkTil | TA | Gd | No | ALQ | 216 | Unf | 0 | 540 | 756 | GasA | Gd | Y | SBrkr | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | Gd | 7 | Typ | 1 | Gd | Detchd | 1998.0 | Unf | 3 | 642 | TA | TA | Y | 0 | 35 | 272 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | Norm | 1Fam | 2Story | 8 | 5 | 2000 | 2000 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 350.0 | Gd | TA | PConc | Gd | TA | Av | GLQ | 655 | Unf | 0 | 490 | 1145 | GasA | Ex | Y | SBrkr | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | Gd | 9 | Typ | 1 | TA | Attchd | 2000.0 | RFn | 3 | 836 | TA | TA | Y | 192 | 84 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
df_rp_t.head()
| MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Id | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 1461 | 20 | RH | 80.0 | 11622 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Feedr | Norm | 1Fam | 1Story | 5 | 6 | 1961 | 1961 | Gable | CompShg | VinylSd | VinylSd | NaN | 0.0 | TA | TA | CBlock | TA | TA | No | Rec | 468.0 | LwQ | 144.0 | 270.0 | 882.0 | GasA | TA | Y | SBrkr | 896 | 0 | 0 | 896 | 0.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 5 | Typ | 0 | NaN | Attchd | 1961.0 | Unf | 1.0 | 730.0 | TA | TA | Y | 140 | 0 | 0 | 0 | 120 | 0 | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal |
| 1462 | 20 | RL | 81.0 | 14267 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 6 | 1958 | 1958 | Hip | CompShg | Wd Sdng | Wd Sdng | BrkFace | 108.0 | TA | TA | CBlock | TA | TA | No | ALQ | 923.0 | Unf | 0.0 | 406.0 | 1329.0 | GasA | TA | Y | SBrkr | 1329 | 0 | 0 | 1329 | 0.0 | 0.0 | 1 | 1 | 3 | 1 | Gd | 6 | Typ | 0 | NaN | Attchd | 1958.0 | Unf | 1.0 | 312.0 | TA | TA | Y | 393 | 36 | 0 | 0 | 0 | 0 | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal |
| 1463 | 60 | RL | 74.0 | 13830 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 5 | 5 | 1997 | 1998 | Gable | CompShg | VinylSd | VinylSd | NaN | 0.0 | TA | TA | PConc | Gd | TA | No | GLQ | 791.0 | Unf | 0.0 | 137.0 | 928.0 | GasA | Gd | Y | SBrkr | 928 | 701 | 0 | 1629 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1997.0 | Fin | 2.0 | 482.0 | TA | TA | Y | 212 | 34 | 0 | 0 | 0 | 0 | NaN | MnPrv | NaN | 0 | 3 | 2010 | WD | Normal |
| 1464 | 60 | RL | 78.0 | 9978 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 6 | 6 | 1998 | 1998 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 20.0 | TA | TA | PConc | TA | TA | No | GLQ | 602.0 | Unf | 0.0 | 324.0 | 926.0 | GasA | Ex | Y | SBrkr | 926 | 678 | 0 | 1604 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | Gd | 7 | Typ | 1 | Gd | Attchd | 1998.0 | Fin | 2.0 | 470.0 | TA | TA | Y | 360 | 36 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 6 | 2010 | WD | Normal |
| 1465 | 120 | RL | 43.0 | 5005 | Pave | NaN | IR1 | HLS | AllPub | Inside | Gtl | StoneBr | Norm | Norm | TwnhsE | 1Story | 8 | 5 | 1992 | 1992 | Gable | CompShg | HdBoard | HdBoard | NaN | 0.0 | Gd | TA | PConc | Gd | TA | No | ALQ | 263.0 | Unf | 0.0 | 1017.0 | 1280.0 | GasA | Ex | Y | SBrkr | 1280 | 0 | 0 | 1280 | 0.0 | 0.0 | 2 | 0 | 2 | 1 | Gd | 5 | Typ | 0 | NaN | Attchd | 1992.0 | RFn | 2.0 | 506.0 | TA | TA | Y | 0 | 82 | 0 | 0 | 144 | 0 | NaN | NaN | NaN | 0 | 1 | 2010 | WD | Normal |
df_rp.tail()
| MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Id | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 1456 | 60 | RL | 62.0 | 7917 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 6 | 5 | 1999 | 2000 | Gable | CompShg | VinylSd | VinylSd | NaN | 0.0 | TA | TA | PConc | Gd | TA | No | Unf | 0 | Unf | 0 | 953 | 953 | GasA | Ex | Y | SBrkr | 953 | 694 | 0 | 1647 | 0 | 0 | 2 | 1 | 3 | 1 | TA | 7 | Typ | 1 | TA | Attchd | 1999.0 | RFn | 2 | 460 | TA | TA | Y | 0 | 40 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 8 | 2007 | WD | Normal | 175000 |
| 1457 | 20 | RL | 85.0 | 13175 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | NWAmes | Norm | Norm | 1Fam | 1Story | 6 | 6 | 1978 | 1988 | Gable | CompShg | Plywood | Plywood | Stone | 119.0 | TA | TA | CBlock | Gd | TA | No | ALQ | 790 | Rec | 163 | 589 | 1542 | GasA | TA | Y | SBrkr | 2073 | 0 | 0 | 2073 | 1 | 0 | 2 | 0 | 3 | 1 | TA | 7 | Min1 | 2 | TA | Attchd | 1978.0 | Unf | 2 | 500 | TA | TA | Y | 349 | 0 | 0 | 0 | 0 | 0 | NaN | MnPrv | NaN | 0 | 2 | 2010 | WD | Normal | 210000 |
| 1458 | 70 | RL | 66.0 | 9042 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | Crawfor | Norm | Norm | 1Fam | 2Story | 7 | 9 | 1941 | 2006 | Gable | CompShg | CemntBd | CmentBd | NaN | 0.0 | Ex | Gd | Stone | TA | Gd | No | GLQ | 275 | Unf | 0 | 877 | 1152 | GasA | Ex | Y | SBrkr | 1188 | 1152 | 0 | 2340 | 0 | 0 | 2 | 0 | 4 | 1 | Gd | 9 | Typ | 2 | Gd | Attchd | 1941.0 | RFn | 1 | 252 | TA | TA | Y | 0 | 60 | 0 | 0 | 0 | 0 | NaN | GdPrv | Shed | 2500 | 5 | 2010 | WD | Normal | 266500 |
| 1459 | 20 | RL | 68.0 | 9717 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 5 | 6 | 1950 | 1996 | Hip | CompShg | MetalSd | MetalSd | NaN | 0.0 | TA | TA | CBlock | TA | TA | Mn | GLQ | 49 | Rec | 1029 | 0 | 1078 | GasA | Gd | Y | FuseA | 1078 | 0 | 0 | 1078 | 1 | 0 | 1 | 0 | 2 | 1 | Gd | 5 | Typ | 0 | NaN | Attchd | 1950.0 | Unf | 1 | 240 | TA | TA | Y | 366 | 0 | 112 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 4 | 2010 | WD | Normal | 142125 |
| 1460 | 20 | RL | 75.0 | 9937 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | Edwards | Norm | Norm | 1Fam | 1Story | 5 | 6 | 1965 | 1965 | Gable | CompShg | HdBoard | HdBoard | NaN | 0.0 | Gd | TA | CBlock | TA | TA | No | BLQ | 830 | LwQ | 290 | 136 | 1256 | GasA | Gd | Y | SBrkr | 1256 | 0 | 0 | 1256 | 1 | 0 | 1 | 1 | 3 | 1 | TA | 6 | Typ | 0 | NaN | Attchd | 1965.0 | Fin | 1 | 276 | TA | TA | Y | 736 | 68 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 6 | 2008 | WD | Normal | 147500 |
df_rp.info()
<class 'pandas.core.frame.DataFrame'> Index: 1460 entries, 1 to 1460 Data columns (total 80 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MSSubClass 1460 non-null int64 1 MSZoning 1460 non-null object 2 LotFrontage 1201 non-null float64 3 LotArea 1460 non-null int64 4 Street 1460 non-null object 5 Alley 91 non-null object 6 LotShape 1460 non-null object 7 LandContour 1460 non-null object 8 Utilities 1460 non-null object 9 LotConfig 1460 non-null object 10 LandSlope 1460 non-null object 11 Neighborhood 1460 non-null object 12 Condition1 1460 non-null object 13 Condition2 1460 non-null object 14 BldgType 1460 non-null object 15 HouseStyle 1460 non-null object 16 OverallQual 1460 non-null int64 17 OverallCond 1460 non-null int64 18 YearBuilt 1460 non-null int64 19 YearRemodAdd 1460 non-null int64 20 RoofStyle 1460 non-null object 21 RoofMatl 1460 non-null object 22 Exterior1st 1460 non-null object 23 Exterior2nd 1460 non-null object 24 MasVnrType 588 non-null object 25 MasVnrArea 1452 non-null float64 26 ExterQual 1460 non-null object 27 ExterCond 1460 non-null object 28 Foundation 1460 non-null object 29 BsmtQual 1423 non-null object 30 BsmtCond 1423 non-null object 31 BsmtExposure 1422 non-null object 32 BsmtFinType1 1423 non-null object 33 BsmtFinSF1 1460 non-null int64 34 BsmtFinType2 1422 non-null object 35 BsmtFinSF2 1460 non-null int64 36 BsmtUnfSF 1460 non-null int64 37 TotalBsmtSF 1460 non-null int64 38 Heating 1460 non-null object 39 HeatingQC 1460 non-null object 40 CentralAir 1460 non-null object 41 Electrical 1459 non-null object 42 1stFlrSF 1460 non-null int64 43 2ndFlrSF 1460 non-null int64 44 LowQualFinSF 1460 non-null int64 45 GrLivArea 1460 non-null int64 46 BsmtFullBath 1460 non-null int64 47 BsmtHalfBath 1460 non-null int64 48 FullBath 1460 non-null int64 49 HalfBath 1460 non-null int64 50 BedroomAbvGr 1460 non-null int64 51 KitchenAbvGr 1460 non-null int64 52 KitchenQual 1460 non-null object 53 TotRmsAbvGrd 1460 non-null int64 54 Functional 1460 non-null object 55 Fireplaces 1460 non-null int64 56 FireplaceQu 770 non-null object 57 GarageType 1379 non-null object 58 GarageYrBlt 1379 non-null float64 59 GarageFinish 1379 non-null object 60 GarageCars 1460 non-null int64 61 GarageArea 1460 non-null int64 62 GarageQual 1379 non-null object 63 GarageCond 1379 non-null object 64 PavedDrive 1460 non-null object 65 WoodDeckSF 1460 non-null int64 66 OpenPorchSF 1460 non-null int64 67 EnclosedPorch 1460 non-null int64 68 3SsnPorch 1460 non-null int64 69 ScreenPorch 1460 non-null int64 70 PoolArea 1460 non-null int64 71 PoolQC 7 non-null object 72 Fence 281 non-null object 73 MiscFeature 54 non-null object 74 MiscVal 1460 non-null int64 75 MoSold 1460 non-null int64 76 YrSold 1460 non-null int64 77 SaleType 1460 non-null object 78 SaleCondition 1460 non-null object 79 SalePrice 1460 non-null int64 dtypes: float64(3), int64(34), object(43) memory usage: 923.9+ KB
df_rp.describe()
| MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | TotRmsAbvGrd | Fireplaces | GarageYrBlt | GarageCars | GarageArea | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1460.000000 | 1201.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1452.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1379.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 |
| mean | 56.897260 | 70.049958 | 10516.828082 | 6.099315 | 5.575342 | 1971.267808 | 1984.865753 | 103.685262 | 443.639726 | 46.549315 | 567.240411 | 1057.429452 | 1162.626712 | 346.992466 | 5.844521 | 1515.463699 | 0.425342 | 0.057534 | 1.565068 | 0.382877 | 2.866438 | 1.046575 | 6.517808 | 0.613014 | 1978.506164 | 1.767123 | 472.980137 | 94.244521 | 46.660274 | 21.954110 | 3.409589 | 15.060959 | 2.758904 | 43.489041 | 6.321918 | 2007.815753 | 180921.195890 |
| std | 42.300571 | 24.284752 | 9981.264932 | 1.382997 | 1.112799 | 30.202904 | 20.645407 | 181.066207 | 456.098091 | 161.319273 | 441.866955 | 438.705324 | 386.587738 | 436.528436 | 48.623081 | 525.480383 | 0.518911 | 0.238753 | 0.550916 | 0.502885 | 0.815778 | 0.220338 | 1.625393 | 0.644666 | 24.689725 | 0.747315 | 213.804841 | 125.338794 | 66.256028 | 61.119149 | 29.317331 | 55.757415 | 40.177307 | 496.123024 | 2.703626 | 1.328095 | 79442.502883 |
| min | 20.000000 | 21.000000 | 1300.000000 | 1.000000 | 1.000000 | 1872.000000 | 1950.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 334.000000 | 0.000000 | 0.000000 | 334.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 1900.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2006.000000 | 34900.000000 |
| 25% | 20.000000 | 59.000000 | 7553.500000 | 5.000000 | 5.000000 | 1954.000000 | 1967.000000 | 0.000000 | 0.000000 | 0.000000 | 223.000000 | 795.750000 | 882.000000 | 0.000000 | 0.000000 | 1129.500000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 2.000000 | 1.000000 | 5.000000 | 0.000000 | 1961.000000 | 1.000000 | 334.500000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 5.000000 | 2007.000000 | 129975.000000 |
| 50% | 50.000000 | 69.000000 | 9478.500000 | 6.000000 | 5.000000 | 1973.000000 | 1994.000000 | 0.000000 | 383.500000 | 0.000000 | 477.500000 | 991.500000 | 1087.000000 | 0.000000 | 0.000000 | 1464.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 3.000000 | 1.000000 | 6.000000 | 1.000000 | 1980.000000 | 2.000000 | 480.000000 | 0.000000 | 25.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 2008.000000 | 163000.000000 |
| 75% | 70.000000 | 80.000000 | 11601.500000 | 7.000000 | 6.000000 | 2000.000000 | 2004.000000 | 166.000000 | 712.250000 | 0.000000 | 808.000000 | 1298.250000 | 1391.250000 | 728.000000 | 0.000000 | 1776.750000 | 1.000000 | 0.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 7.000000 | 1.000000 | 2002.000000 | 2.000000 | 576.000000 | 168.000000 | 68.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 2009.000000 | 214000.000000 |
| max | 190.000000 | 313.000000 | 215245.000000 | 10.000000 | 9.000000 | 2010.000000 | 2010.000000 | 1600.000000 | 5644.000000 | 1474.000000 | 2336.000000 | 6110.000000 | 4692.000000 | 2065.000000 | 572.000000 | 5642.000000 | 3.000000 | 2.000000 | 3.000000 | 2.000000 | 8.000000 | 3.000000 | 14.000000 | 3.000000 | 2010.000000 | 4.000000 | 1418.000000 | 857.000000 | 547.000000 | 552.000000 | 508.000000 | 480.000000 | 738.000000 | 15500.000000 | 12.000000 | 2010.000000 | 755000.000000 |
sns_rp.heatmap(df_rp.isnull(), yticklabels=False, cbar=False, cmap='viridis')
<Axes: ylabel='Id'>
df_rp.isna()
| MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Id | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 1 | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | True | True | False | False | False | False | False | False |
| 2 | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | True | True | False | False | False | False | False | False |
| 3 | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | True | True | False | False | False | False | False | False |
| 4 | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | True | True | False | False | False | False | False | False |
| 5 | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | True | True | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1456 | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | True | True | False | False | False | False | False | False |
| 1457 | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | True | False | False | False | False | False | False |
| 1458 | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False |
| 1459 | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | True | True | False | False | False | False | False | False |
| 1460 | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | True | True | False | False | False | False | False | False |
1460 rows × 80 columns
print("Ratio of NaN(Not a NULL) in each Column")
print("**********************************************")
nan_ratio_rp = df_rp.isna().mean()
nan_ratio_rp = nan_ratio_rp[nan_ratio_rp > 0]
for col_rp, ratio_rp in nan_ratio_rp.items():
print(f"{col_rp} : {ratio_rp:.2f}")
Ratio of NaN(Not a NULL) in each Column ********************************************** LotFrontage : 0.18 Alley : 0.94 MasVnrType : 0.60 MasVnrArea : 0.01 BsmtQual : 0.03 BsmtCond : 0.03 BsmtExposure : 0.03 BsmtFinType1 : 0.03 BsmtFinType2 : 0.03 Electrical : 0.00 FireplaceQu : 0.47 GarageType : 0.06 GarageYrBlt : 0.06 GarageFinish : 0.06 GarageQual : 0.06 GarageCond : 0.06 PoolQC : 1.00 Fence : 0.81 MiscFeature : 0.96
df_rp.drop(columns=['MasVnrType','PoolQC','PoolArea','BsmtHalfBath','KitchenAbvGr','Utilities'],inplace=True)
df_rp['LotFrontage'] = df_rp['LotFrontage'].fillna(0)
df_rp['Alley'] = df_rp['Alley'].fillna("No Alley Acess")
df_rp['MasVnrArea'] = df_rp['MasVnrArea'].fillna(df_rp['MasVnrArea'].mean())
df_rp['LotFrontage'] = df_rp['LotFrontage'].fillna(0)
df_rp['BsmtQual'] = df_rp['BsmtQual'].fillna("No Basment")
df_rp['BsmtCond'] = df_rp['BsmtCond'].fillna("No Basment")
df_rp['BsmtExposure'] = df_rp['BsmtExposure'].fillna("No Basment")
df_rp['BsmtFinType1'] = df_rp['BsmtFinType1'].fillna("No Basment")
df_rp['BsmtFinType2'] = df_rp['BsmtFinType2'].fillna("No Basment")
df_rp['Electrical'] = df_rp['Electrical'].fillna(df_rp['Electrical'].mode()[0])
df_rp['FireplaceQu'] = df_rp['FireplaceQu'].fillna("No Fireplace")
df_rp['GarageType'] = df_rp['GarageType'].fillna("No Garage")
df_rp['GarageYrBlt'] = df_rp['GarageYrBlt'].fillna(0)
df_rp['GarageFinish'] = df_rp['GarageFinish'].fillna("No Garage")
df_rp['GarageQual'] = df_rp['GarageQual'].fillna("No Garage")
df_rp['GarageCond'] = df_rp['GarageCond'].fillna("No Garage")
df_rp['Fence'] = df_rp['Fence'].fillna("No Fence")
df_rp['MiscFeature'] = df_rp['MiscFeature'].fillna("No Miscellaneous Feature")
from scipy.stats import zscore
def remove_outlier_zscore_rp(df_rp, x, zscore_thres=3):
df_rp[f'{x}_zscore'] = zscore(df_rp[f'{x}'])
df_rp.drop(df_rp[(df_rp[f'{x}_zscore'] >= zscore_thres) | (df_rp[f'{x}_zscore'] <= -zscore_thres)].index, inplace=True)
df_rp.drop(columns=[f'{x}_zscore'], inplace=True)
columns_rps = df_rp.select_dtypes('number').columns.tolist()
for col_rp in columns_rps:
remove_outlier_zscore_rp(df_rp, col_rp)
df_rp.info()
<class 'pandas.core.frame.DataFrame'> Index: 1001 entries, 1 to 1457 Data columns (total 74 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MSSubClass 1001 non-null int64 1 MSZoning 1001 non-null object 2 LotFrontage 1001 non-null float64 3 LotArea 1001 non-null int64 4 Street 1001 non-null object 5 Alley 1001 non-null object 6 LotShape 1001 non-null object 7 LandContour 1001 non-null object 8 LotConfig 1001 non-null object 9 LandSlope 1001 non-null object 10 Neighborhood 1001 non-null object 11 Condition1 1001 non-null object 12 Condition2 1001 non-null object 13 BldgType 1001 non-null object 14 HouseStyle 1001 non-null object 15 OverallQual 1001 non-null int64 16 OverallCond 1001 non-null int64 17 YearBuilt 1001 non-null int64 18 YearRemodAdd 1001 non-null int64 19 RoofStyle 1001 non-null object 20 RoofMatl 1001 non-null object 21 Exterior1st 1001 non-null object 22 Exterior2nd 1001 non-null object 23 MasVnrArea 1001 non-null float64 24 ExterQual 1001 non-null object 25 ExterCond 1001 non-null object 26 Foundation 1001 non-null object 27 BsmtQual 1001 non-null object 28 BsmtCond 1001 non-null object 29 BsmtExposure 1001 non-null object 30 BsmtFinType1 1001 non-null object 31 BsmtFinSF1 1001 non-null int64 32 BsmtFinType2 1001 non-null object 33 BsmtFinSF2 1001 non-null int64 34 BsmtUnfSF 1001 non-null int64 35 TotalBsmtSF 1001 non-null int64 36 Heating 1001 non-null object 37 HeatingQC 1001 non-null object 38 CentralAir 1001 non-null object 39 Electrical 1001 non-null object 40 1stFlrSF 1001 non-null int64 41 2ndFlrSF 1001 non-null int64 42 LowQualFinSF 1001 non-null int64 43 GrLivArea 1001 non-null int64 44 BsmtFullBath 1001 non-null int64 45 FullBath 1001 non-null int64 46 HalfBath 1001 non-null int64 47 BedroomAbvGr 1001 non-null int64 48 KitchenQual 1001 non-null object 49 TotRmsAbvGrd 1001 non-null int64 50 Functional 1001 non-null object 51 Fireplaces 1001 non-null int64 52 FireplaceQu 1001 non-null object 53 GarageType 1001 non-null object 54 GarageYrBlt 1001 non-null float64 55 GarageFinish 1001 non-null object 56 GarageCars 1001 non-null int64 57 GarageArea 1001 non-null int64 58 GarageQual 1001 non-null object 59 GarageCond 1001 non-null object 60 PavedDrive 1001 non-null object 61 WoodDeckSF 1001 non-null int64 62 OpenPorchSF 1001 non-null int64 63 EnclosedPorch 1001 non-null int64 64 3SsnPorch 1001 non-null int64 65 ScreenPorch 1001 non-null int64 66 Fence 1001 non-null object 67 MiscFeature 1001 non-null object 68 MiscVal 1001 non-null int64 69 MoSold 1001 non-null int64 70 YrSold 1001 non-null int64 71 SaleType 1001 non-null object 72 SaleCondition 1001 non-null object 73 SalePrice 1001 non-null int64 dtypes: float64(3), int64(31), object(40) memory usage: 586.5+ KB
sns_rp.heatmap(df_rp.isnull(), yticklabels=False, cbar=False, cmap='viridis')
<Axes: ylabel='Id'>
df_rp.shape
(1001, 74)
df_rp.describe()
| MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | FullBath | HalfBath | BedroomAbvGr | TotRmsAbvGrd | Fireplaces | GarageYrBlt | GarageCars | GarageArea | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | MiscVal | MoSold | YrSold | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 | 1001.000000 |
| mean | 55.024975 | 55.664336 | 9289.864136 | 6.053946 | 5.524476 | 1975.368631 | 1985.549451 | 84.297814 | 416.560440 | 22.208791 | 568.288711 | 1007.057942 | 1103.766234 | 322.947053 | 0.292707 | 1427.005994 | 0.381618 | 1.565435 | 0.369630 | 2.812188 | 6.278721 | 0.546454 | 1979.672328 | 1.810190 | 479.136863 | 88.885115 | 39.817183 | 13.250749 | 0.118881 | 5.208791 | 14.839161 | 6.248751 | 2007.814186 | 171110.950050 |
| std | 39.801688 | 31.676193 | 3830.783887 | 1.253430 | 0.985723 | 27.635864 | 20.522519 | 132.001834 | 397.369021 | 84.511924 | 427.616819 | 365.825514 | 319.603944 | 404.584550 | 4.682226 | 406.913091 | 0.486027 | 0.515717 | 0.482946 | 0.688978 | 1.346565 | 0.595055 | 24.248598 | 0.589861 | 166.960134 | 106.565312 | 49.922235 | 40.553644 | 3.119432 | 26.049671 | 97.684672 | 2.668157 | 1.345154 | 57377.085862 |
| min | 20.000000 | 0.000000 | 1300.000000 | 2.000000 | 3.000000 | 1890.000000 | 1950.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 438.000000 | 0.000000 | 0.000000 | 438.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 3.000000 | 0.000000 | 1910.000000 | 1.000000 | 160.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2006.000000 | 35311.000000 |
| 25% | 20.000000 | 40.000000 | 7252.000000 | 5.000000 | 5.000000 | 1957.000000 | 1968.000000 | 0.000000 | 0.000000 | 0.000000 | 239.000000 | 791.000000 | 864.000000 | 0.000000 | 0.000000 | 1113.000000 | 0.000000 | 1.000000 | 0.000000 | 2.000000 | 5.000000 | 0.000000 | 1962.000000 | 1.000000 | 352.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 4.000000 | 2007.000000 | 130000.000000 |
| 50% | 50.000000 | 63.000000 | 9120.000000 | 6.000000 | 5.000000 | 1977.000000 | 1995.000000 | 0.000000 | 398.000000 | 0.000000 | 484.000000 | 960.000000 | 1052.000000 | 0.000000 | 0.000000 | 1414.000000 | 0.000000 | 2.000000 | 0.000000 | 3.000000 | 6.000000 | 0.000000 | 1983.000000 | 2.000000 | 474.000000 | 0.000000 | 25.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 2008.000000 | 160000.000000 |
| 75% | 70.000000 | 76.000000 | 11040.000000 | 7.000000 | 6.000000 | 2002.000000 | 2004.000000 | 143.000000 | 690.000000 | 0.000000 | 810.000000 | 1234.000000 | 1314.000000 | 703.000000 | 0.000000 | 1682.000000 | 1.000000 | 2.000000 | 1.000000 | 3.000000 | 7.000000 | 1.000000 | 2003.000000 | 2.000000 | 576.000000 | 168.000000 | 62.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 2009.000000 | 201000.000000 |
| max | 180.000000 | 152.000000 | 36500.000000 | 10.000000 | 8.000000 | 2009.000000 | 2010.000000 | 640.000000 | 1573.000000 | 532.000000 | 1869.000000 | 2136.000000 | 2136.000000 | 1427.000000 | 80.000000 | 2727.000000 | 1.000000 | 3.000000 | 1.000000 | 5.000000 | 10.000000 | 2.000000 | 2010.000000 | 3.000000 | 954.000000 | 441.000000 | 224.000000 | 192.000000 | 96.000000 | 170.000000 | 1300.000000 | 12.000000 | 2010.000000 | 354000.000000 |
col_rps = df_rp.columns
col_rps
Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood',
'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual',
'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
'Exterior1st', 'Exterior2nd', 'MasVnrArea', 'ExterQual', 'ExterCond',
'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF',
'2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional',
'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
'ScreenPorch', 'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold',
'SaleType', 'SaleCondition', 'SalePrice'],
dtype='object')
for column in col_rps:
plt_rp.figure(figsize=(7, 5))
df_rp[column].hist(color='#3498db')
plt_rp.title(f'Histogram for {column}', fontsize=11)
plt_rp.xlabel(column, fontsize=9)
plt_rp.ylabel('y_count', fontsize=9)
plt_rp.show()
plt_rp.figure(figsize=(35, 35))
sns_rp.heatmap(
df_rp.select_dtypes('number').corr(),
vmax=0.7,
annot=True,
square=True,
cmap='Oranges'
)
plt_rp.show()
def plot_numeric_col_with_price_rp(ax_rp, x):
scatter = ax_rp.scatter(x=df_rp[x], y=df_rp['SalePrice'], c=df_rp['SalePrice'], cmap='Blues', alpha=0.75, s=10)
ax_rp.set(xlabel=x, ylabel='Sales Price', title=f'{x} vs Sale Price')
plt_rp.colorbar(scatter, ax=ax_rp, label='SalePrice')
selected_columns_rp = [column for column in df_rp.select_dtypes('number').columns if df_rp[column].nunique() > 16]
num_rows_rp = math.ceil(len(selected_columns_rp) / 2)
fig, axes = plt_rp.subplots(num_rows_rp, 2, figsize=(16, 5 * num_rows_rp))
axes = axes.flatten()
for axlen, column in enumerate(selected_columns_rp):
if axlen >= len(axes):
break
plot_numeric_col_with_price_rp(axes[axlen], column)
plt_rp.tight_layout()
plt_rp.show()
def box_plot_categorical_column_with_price(x, ax_rp):
sns_rp.boxplot(x=x, y='SalePrice', data=df_rp, ax=ax_rp, palette='Blues')
ax_rp.set_xlabel(x)
ax_rp.set_ylabel('Sale Price')
ax_rp.set_xticklabels(ax_rp.get_xticklabels(), rotation=46, ha='right')
ax_rp.set_title(f'Box Plot for Sales Price vs {x}')
columns = df_rp.select_dtypes(exclude='number').columns.tolist()
for c in df_rp.select_dtypes('number').columns.tolist():
if df_rp[f'{c}'].nunique() <= 16:
columns.append(c)
num_columns = 3
num_rows = (len(columns) + num_columns - 1) // num_columns
fig, axes = plt_rp.subplots(num_rows, num_columns, figsize=(16, 6 * num_rows), sharey=True)
axes = axes.flatten()
for num, column in enumerate(columns):
box_plot_categorical_column_with_price(column, axes[num])
for temp in range(len(columns), num_rows * num_columns):
fig.delaxes(axes[temp])
plt_rp.tight_layout()
plt_rp.show()
df_rp = pd_rp.get_dummies(df_rp,dtype=float,drop_first=True)
df_rp.head()
| MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | FullBath | HalfBath | BedroomAbvGr | TotRmsAbvGrd | Fireplaces | GarageYrBlt | GarageCars | GarageArea | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | MiscVal | MoSold | YrSold | SalePrice | MSZoning_FV | MSZoning_RH | MSZoning_RL | MSZoning_RM | Street_Pave | Alley_No Alley Acess | Alley_Pave | LotShape_IR2 | LotShape_IR3 | LotShape_Reg | LandContour_HLS | LandContour_Low | LandContour_Lvl | LotConfig_CulDSac | LotConfig_FR2 | LotConfig_FR3 | LotConfig_Inside | LandSlope_Mod | LandSlope_Sev | Neighborhood_Blueste | Neighborhood_BrDale | Neighborhood_BrkSide | Neighborhood_ClearCr | Neighborhood_CollgCr | Neighborhood_Crawfor | Neighborhood_Edwards | Neighborhood_Gilbert | Neighborhood_IDOTRR | Neighborhood_MeadowV | Neighborhood_Mitchel | Neighborhood_NAmes | Neighborhood_NPkVill | Neighborhood_NWAmes | Neighborhood_NoRidge | Neighborhood_NridgHt | Neighborhood_OldTown | Neighborhood_SWISU | Neighborhood_Sawyer | Neighborhood_SawyerW | Neighborhood_Somerst | Neighborhood_StoneBr | Neighborhood_Timber | Neighborhood_Veenker | Condition1_Feedr | Condition1_Norm | Condition1_PosA | Condition1_PosN | Condition1_RRAe | Condition1_RRAn | Condition1_RRNe | Condition1_RRNn | Condition2_Norm | Condition2_RRAn | Condition2_RRNn | BldgType_2fmCon | BldgType_Duplex | BldgType_Twnhs | BldgType_TwnhsE | HouseStyle_1.5Unf | HouseStyle_1Story | HouseStyle_2.5Unf | HouseStyle_2Story | HouseStyle_SFoyer | HouseStyle_SLvl | RoofStyle_Gable | RoofStyle_Gambrel | RoofStyle_Hip | RoofStyle_Mansard | RoofMatl_Roll | RoofMatl_Tar&Grv | RoofMatl_WdShngl | Exterior1st_BrkComm | Exterior1st_BrkFace | Exterior1st_CemntBd | Exterior1st_HdBoard | Exterior1st_ImStucc | Exterior1st_MetalSd | Exterior1st_Plywood | Exterior1st_Stucco | Exterior1st_VinylSd | Exterior1st_Wd Sdng | Exterior1st_WdShing | Exterior2nd_AsphShn | Exterior2nd_Brk Cmn | Exterior2nd_BrkFace | Exterior2nd_CmentBd | Exterior2nd_HdBoard | Exterior2nd_ImStucc | Exterior2nd_MetalSd | Exterior2nd_Other | Exterior2nd_Plywood | Exterior2nd_Stone | Exterior2nd_Stucco | Exterior2nd_VinylSd | Exterior2nd_Wd Sdng | Exterior2nd_Wd Shng | ExterQual_Fa | ExterQual_Gd | ExterQual_TA | ExterCond_Gd | ExterCond_TA | Foundation_CBlock | Foundation_PConc | Foundation_Slab | Foundation_Stone | Foundation_Wood | BsmtQual_Fa | BsmtQual_Gd | BsmtQual_No Basment | BsmtQual_TA | BsmtCond_Gd | BsmtCond_No Basment | BsmtCond_TA | BsmtExposure_Gd | BsmtExposure_Mn | BsmtExposure_No | BsmtExposure_No Basment | BsmtFinType1_BLQ | BsmtFinType1_GLQ | BsmtFinType1_LwQ | BsmtFinType1_No Basment | BsmtFinType1_Rec | BsmtFinType1_Unf | BsmtFinType2_BLQ | BsmtFinType2_GLQ | BsmtFinType2_LwQ | BsmtFinType2_No Basment | BsmtFinType2_Rec | BsmtFinType2_Unf | Heating_GasA | Heating_GasW | Heating_Grav | Heating_Wall | HeatingQC_Fa | HeatingQC_Gd | HeatingQC_TA | CentralAir_Y | Electrical_FuseF | Electrical_FuseP | Electrical_SBrkr | KitchenQual_Fa | KitchenQual_Gd | KitchenQual_TA | Functional_Min1 | Functional_Min2 | Functional_Mod | Functional_Typ | FireplaceQu_Fa | FireplaceQu_Gd | FireplaceQu_No Fireplace | FireplaceQu_Po | FireplaceQu_TA | GarageType_Attchd | GarageType_Basment | GarageType_BuiltIn | GarageType_CarPort | GarageType_Detchd | GarageFinish_RFn | GarageFinish_Unf | GarageQual_Fa | GarageQual_Gd | GarageQual_Po | GarageQual_TA | GarageCond_Fa | GarageCond_Gd | GarageCond_Po | GarageCond_TA | PavedDrive_P | PavedDrive_Y | Fence_GdWo | Fence_MnPrv | Fence_MnWw | Fence_No Fence | MiscFeature_Shed | SaleType_CWD | SaleType_Con | SaleType_ConLD | SaleType_ConLI | SaleType_ConLw | SaleType_New | SaleType_Oth | SaleType_WD | SaleCondition_AdjLand | SaleCondition_Alloca | SaleCondition_Family | SaleCondition_Normal | SaleCondition_Partial | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Id | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 1 | 60 | 65.0 | 8450 | 7 | 5 | 2003 | 2003 | 196.0 | 706 | 0 | 150 | 856 | 856 | 854 | 0 | 1710 | 1 | 2 | 1 | 3 | 8 | 0 | 2003.0 | 2 | 548 | 0 | 61 | 0 | 0 | 0 | 0 | 2 | 2008 | 208500 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 2 | 20 | 80.0 | 9600 | 6 | 8 | 1976 | 1976 | 0.0 | 978 | 0 | 284 | 1262 | 1262 | 0 | 0 | 1262 | 0 | 2 | 0 | 3 | 6 | 1 | 1976.0 | 2 | 460 | 298 | 0 | 0 | 0 | 0 | 0 | 5 | 2007 | 181500 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 3 | 60 | 68.0 | 11250 | 7 | 5 | 2001 | 2002 | 162.0 | 486 | 0 | 434 | 920 | 920 | 866 | 0 | 1786 | 1 | 2 | 1 | 3 | 6 | 1 | 2001.0 | 2 | 608 | 0 | 42 | 0 | 0 | 0 | 0 | 9 | 2008 | 223500 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 5 | 60 | 84.0 | 14260 | 8 | 5 | 2000 | 2000 | 350.0 | 655 | 0 | 490 | 1145 | 1145 | 1053 | 0 | 2198 | 1 | 2 | 1 | 4 | 9 | 1 | 2000.0 | 3 | 836 | 192 | 84 | 0 | 0 | 0 | 0 | 12 | 2008 | 250000 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 7 | 20 | 75.0 | 10084 | 8 | 5 | 2004 | 2005 | 186.0 | 1369 | 0 | 317 | 1686 | 1694 | 0 | 0 | 1694 | 1 | 2 | 0 | 3 | 7 | 1 | 2004.0 | 2 | 636 | 255 | 57 | 0 | 0 | 0 | 0 | 8 | 2007 | 307000 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
target_col = df_rp.columns.get_loc('SalePrice')
X = df_rp.iloc[:, [col_rp for col_rp in range(df_rp.shape[1]) if col_rp != target_col]].values
y = df_rp.iloc[:,target_col].values
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=256)
X_train_nstd = X_train.copy()
X_test_nstd = X_test.copy()
y_train_nstd = y_train.copy()
X_train_xg= X_train.copy()
X_test_xg=X_test.copy()
y_train_xg= y_train.copy()
y_test_xg=y_test.copy()
sc_X = StandardScaler()
X_train[:, :34] = sc_X.fit_transform(X_train[:, :34])
X_test[:, :34] = sc_X.transform(X_test[:, :34])
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train.reshape(-1,1)).flatten()
multi_lr_rp = LinearRegression()
multi_lr_rp.fit(X_train_nstd, y_train_nstd)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
# Printing Training and Testing sets R2 and MSE Scores
def print_scores(r2_train_rp, r2_test_rp, mse_train_rp, mse_test_rp, algo_rp):
print("R2 Train Score using {}: {}".format(algo_rp, r2_train_rp))
print("R2 Test Score using {} : {}".format(algo_rp, r2_test_rp))
print("Mean Squared Error of Train using {} : {}".format(algo_rp, mse_train_rp))
print("Mean Squared Error of Test using {} : {}".format(algo_rp, mse_test_rp))
y_pred = multi_lr_rp.predict(X_test_nstd)
y_train_pred = multi_lr_rp.predict(X_train_nstd)
r2_lr_train_rp = r2_score(y_train_nstd, y_train_pred)
r2_lr_test_rp = r2_score(y_test, y_pred.reshape(-1,1))
mse_lr_train_rp = mean_squared_error(y_train_nstd, y_train_pred)
mse_lr_test_rp = mean_squared_error(y_test, y_pred.reshape(-1,1))
print_scores(r2_lr_train_rp,r2_lr_test_rp,mse_lr_train_rp,mse_lr_test_rp,"Linear Regression")
R2 Train Score using Linear Regression: 0.9491941900302114 R2 Test Score using Linear Regression : 0.8967704699231831 Mean Squared Error of Train using Linear Regression : 165463735.30246514 Mean Squared Error of Test using Linear Regression : 352617475.9582878
poly_reg_rp = PolynomialFeatures(degree = 2)
X_train_poly = poly_reg_rp.fit_transform(X_train)
X_test_poly = poly_reg_rp.transform(X_test)
poly_lr_rp = LinearRegression()
poly_lr_rp.fit(X_train_poly, y_train)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
y_pred = poly_lr_rp.predict(X_test_poly)
y_train_pred = poly_lr_rp.predict(X_train_poly)
r2_poly_train_rp = r2_score(y_train,y_train_pred)
r2_poly_test_rp = r2_score(y_test,sc_y.inverse_transform(y_pred.reshape(-1,1)))
mse_poly_train_rp = mean_squared_error(y_train, y_train_pred)
mse_poly_test_rp = mean_squared_error(y_test, sc_y.inverse_transform(y_pred.reshape(-1,1)))
print_scores(r2_poly_train_rp, r2_poly_test_rp, mse_poly_train_rp, mse_poly_test_rp, "Polynomial Regression")
R2 Train Score using Polynomial Regression: 1.0 R2 Test Score using Polynomial Regression : 0.8253451949027779 Mean Squared Error of Train using Polynomial Regression : 8.631847175963858e-28 Mean Squared Error of Test using Polynomial Regression : 596596114.4213339
rdm_frst_rps_75 = RandomForestRegressor(n_estimators = 75)
rdm_frst_rps_75.fit(X_train, y_train)
RandomForestRegressor(n_estimators=75)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestRegressor(n_estimators=75)
rdm_frst_rps_100 = RandomForestRegressor(n_estimators = 100)
rdm_frst_rps_100.fit(X_train, y_train)
RandomForestRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestRegressor()
rdm_frst_rps_125 = RandomForestRegressor(n_estimators = 125)
rdm_frst_rps_125.fit(X_train, y_train)
RandomForestRegressor(n_estimators=125)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestRegressor(n_estimators=125)
y_pred = rdm_frst_rps_100.predict(X_test)
y_train_pred = rdm_frst_rps_100.predict(X_train)
r2_frst_train_rp = r2_score(y_train, y_train_pred)
r2_frst_test_rp = r2_score(y_test, sc_y.inverse_transform(y_pred.reshape(-1,1)))
mse_frst_train_rp= mean_squared_error(y_train, y_train_pred)
mse_frst_test_rp = mean_squared_error(y_test, sc_y.inverse_transform(y_pred.reshape(-1,1)))
print_scores(r2_frst_train_rp, r2_frst_test_rp, mse_frst_train_rp, mse_frst_test_rp, "Random Forest Regression (100)")
R2 Train Score using Random Forest Regression (100): 0.9834267646891539 R2 Test Score using Random Forest Regression (100) : 0.876744889688229 Mean Squared Error of Train using Random Forest Regression (100) : 0.016573235310846136 Mean Squared Error of Test using Random Forest Regression (100) : 421022026.010924
y_pred = rdm_frst_rps_75.predict(X_test)
y_train_pred = rdm_frst_rps_75.predict(X_train)
r2_frst_train_75 = r2_score(y_train, y_train_pred)
r2_frst_test_75 = r2_score(y_test, sc_y.inverse_transform(y_pred.reshape(-1,1)))
mse_frst_train_75 = mean_squared_error(y_train, y_train_pred)
mse_frst_test_75 = mean_squared_error(y_test, sc_y.inverse_transform(y_pred.reshape(-1,1)))
print_scores(r2_frst_train_75, r2_frst_test_75, mse_frst_train_75, mse_frst_test_75, "Random Forest Regression (75)")
R2 Train Score using Random Forest Regression (75): 0.9826005742915324 R2 Test Score using Random Forest Regression (75) : 0.8805141613357595 Mean Squared Error of Train using Random Forest Regression (75) : 0.01739942570846763 Mean Squared Error of Test using Random Forest Regression (75) : 408146727.1156918
y_pred = rdm_frst_rps_125.predict(X_test)
y_train_pred = rdm_frst_rps_125.predict(X_train)
r2_frst_train_rp = r2_score(y_train, y_train_pred)
r2_frst_test_rp = r2_score(y_test, sc_y.inverse_transform(y_pred.reshape(-1,1)))
mse_frst_train_rp = mean_squared_error(y_train, y_train_pred)
mse_frst_test_rp = mean_squared_error(y_test, sc_y.inverse_transform(y_pred.reshape(-1,1)))
print_scores(r2_frst_train_rp, r2_frst_test_rp, mse_frst_train_rp, mse_frst_test_rp, "Random Forest Regression (125)")
R2 Train Score using Random Forest Regression (125): 0.9834972738384505 R2 Test Score using Random Forest Regression (125) : 0.878074760293716 Mean Squared Error of Train using Random Forest Regression (125) : 0.01650272616154943 Mean Squared Error of Test using Random Forest Regression (125) : 416479376.09370524
svr_lrp_rs = SVR(kernel = 'linear')
svr_lrp_rs.fit(X_train, y_train)
SVR(kernel='linear')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVR(kernel='linear')
y_pred = svr_lrp_rs.predict(X_test)
y_train_pred = svr_lrp_rs.predict(X_train)
r2_svr_lr_train_rp = r2_score(y_train, y_train_pred)
r2_svr_lr_test_rp = r2_score(y_test, sc_y.inverse_transform(y_pred.reshape(-1,1)))
mse_svr_lr_train_rp = mean_squared_error(y_train, y_train_pred)
mse_svr_lr_test_rp = mean_squared_error(y_test, sc_y.inverse_transform(y_pred.reshape(-1,1)))
print_scores(r2_svr_lr_train_rp, r2_svr_lr_test_rp, mse_svr_lr_train_rp, mse_svr_lr_test_rp, "Support Vector Regression")
R2 Train Score using Support Vector Regression: 0.9439305022854688 R2 Test Score using Support Vector Regression : 0.9138355058165617 Mean Squared Error of Train using Support Vector Regression : 0.05606949771453119 Mean Squared Error of Test using Support Vector Regression : 294325726.69445837
svr_rbf_rp = SVR(kernel = 'rbf')
svr_rbf_rp.fit(X_train, y_train)
SVR()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVR()
y_pred = svr_rbf_rp.predict(X_test)
y_train_pred = svr_rbf_rp.predict(X_train)
r2_svr_train_rp = r2_score(y_train, y_train_pred)
r2_svr_test_rp = r2_score(y_test, sc_y.inverse_transform(y_pred.reshape(-1,1)))
mse_svr_train_rp = mean_squared_error(y_train, y_train_pred)
mse_svr_test_rp = mean_squared_error(y_test, sc_y.inverse_transform(y_pred.reshape(-1,1)))
print_scores(r2_svr_train_rp, r2_svr_test_rp, mse_svr_train_rp, mse_svr_test_rp, "Support Vector Regression_rbf")
R2 Train Score using Support Vector Regression_rbf: 0.9677502574058094 R2 Test Score using Support Vector Regression_rbf : 0.9005401855006185 Mean Squared Error of Train using Support Vector Regression_rbf : 0.032249742594190514 Mean Squared Error of Test using Support Vector Regression_rbf : 339740660.6612818
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
param_grid_rs = {
'learning_rate': [ 0.1, 0.2, 0.3, 0.4],
'gamma': [0.01, 0.1, 0.2, 0.3, 0.4]
}
xgb = XGBRegressor()
grid_search_rp = GridSearchCV(estimator=xgb, param_grid=param_grid_rs, scoring='neg_mean_squared_error', cv=5)
grid_search_rp.fit(X_train_xg, y_train_xg)
best_params = grid_search_rp.best_params_
print("Best Hyperparameters:", best_params)
y_pred = grid_search.best_estimator_.predict(X_test_xg)
Best Hyperparameters: {'gamma': 0.01, 'learning_rate': 0.1}
xgb_rp = XGBRegressor(learning_rate=0.1,gamma=0.01)
xgb_rp.fit(X_train, y_train)
XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=0.01, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.1, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=0.01, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.1, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)y_pred = xgb_rp.predict(X_test)
y_train_pred = xgb_rp.predict(X_train)
r2_xgb_train_rp = r2_score(y_train, y_train_pred)
r2_xgb_test_rp = r2_score(y_test, sc_y.inverse_transform(y_pred.reshape(-1,1)))
mse_xgb_train_rp = mean_squared_error(y_train, y_train_pred)
mse_xgb_test_rp = mean_squared_error(y_test, sc_y.inverse_transform(y_pred.reshape(-1,1)))
print_scores(r2_xgb_train_rp, r2_xgb_test_rp, mse_xgb_train_rp, mse_xgb_test_rp, "XGBoost")
R2 Train Score using XGBoost: 0.9960759723139573 R2 Test Score using XGBoost : 0.8813379856508698 Mean Squared Error of Train using XGBoost : 0.00392402768604268 Mean Squared Error of Test using XGBoost : 405332659.76102
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)
knn_test = KNeighborsRegressor(n_neighbors=500)
knn_test.fit(X_train, y_train)
y_pred = knn.predict(X_test)
y_train_pred = knn.predict(X_train)
r2_knn_train_rp = r2_score(y_train, y_train_pred)
r2_knn_test_rp = r2_score(y_test, y_pred)
mse_knn_train_rp = mean_squared_error(y_train, y_train_pred)
mse_knn_test_rp = mean_squared_error(y_test, y_pred)
print_scores(r2_knn_train_rp, r2_knn_test_rp, mse_knn_train_rp, mse_knn_test_rp , "KNN")
R2 Train Score using KNN: 0.8902326710346942 R2 Test Score using KNN : -8.504651413383005 Mean Squared Error of Train using KNN : 0.10976732896530578 Mean Squared Error of Test using KNN : 32466545074.42308
y_pred = knn_test.predict(X_test)
y_train_pred = knn_test.predict(X_train)
r2_knn_train_rp = r2_score(y_train, y_train_pred)
r2_knn_test_rp = r2_score(y_test, y_pred)
mse_knn_train_rp = mean_squared_error(y_train, y_train_pred)
mse_knn_test_rp = mean_squared_error(y_test, y_pred)
print_scores(r2_knn_train_rp, r2_knn_test_rp, mse_knn_train_rp, mse_knn_test_rp , "KNN(500)")
R2 Train Score using KNN(500): 0.4666241814351054 R2 Test Score using KNN(500) : -8.504667145091746 Mean Squared Error of Train using KNN(500) : 0.5333758185648946 Mean Squared Error of Test using KNN(500) : 32466598811.715347
models = pd_rp.DataFrame({
'Model': [
'Multiple Linear Regression','Polynomial Regression',
'Random Forest', 'Support Vector Regression','Linear Support Vector Regression','XGBoost','KNN'
],
'Training R2 Score': [
r2_lr_train_rp,r2_poly_train_rp,r2_frst_train_75,r2_svr_train_rp,r2_svr_lr_train_rp,r2_xgb_train_rp, r2_knn_train_rp
],
'Training Mean Square Error': [
mse_lr_train_rp,mse_poly_train_rp,mse_frst_train_75,mse_svr_train_rp,mse_svr_lr_train_rp,mse_xgb_train_rp, mse_knn_train_rp
],
'Testing R2 Score': [
r2_lr_test_rp,r2_poly_test_rp,r2_frst_test_75,r2_svr_test_rp,r2_svr_lr_test_rp,r2_xgb_test_rp , r2_knn_test_rp
],
'Testing Mean Square Error': [
mse_lr_test_rp,mse_poly_test_rp,mse_frst_test_75,mse_svr_test_rp,mse_svr_lr_test_rp,mse_xgb_test_rp, mse_knn_test_rp
]
})
models.sort_values(by='Testing R2 Score', ascending=False).style.background_gradient(
cmap='Blues')
| Model | Training R2 Score | Training Mean Square Error | Testing R2 Score | Testing Mean Square Error | |
|---|---|---|---|---|---|
| 4 | Linear Support Vector Regression | 0.943931 | 0.056069 | 0.913836 | 294325726.694458 |
| 3 | Support Vector Regression | 0.967750 | 0.032250 | 0.900540 | 339740660.661282 |
| 0 | Multiple Linear Regression | 0.949194 | 165463735.302465 | 0.896770 | 352617475.958288 |
| 5 | XGBoost | 0.996076 | 0.003924 | 0.881338 | 405332659.761020 |
| 2 | Random Forest | 0.982601 | 0.017399 | 0.880514 | 408146727.115692 |
| 1 | Polynomial Regression | 1.000000 | 0.000000 | 0.825345 | 596596114.421334 |
| 6 | KNN | 0.466624 | 0.533376 | -8.504667 | 32466598811.715347 |
y_pred = svr_lrp_rs.predict(X_test)
y_train_pred = svr_lrp_rs.predict(X_train)
r2_svr_lr_train_rp = r2_score(y_train, y_train_pred)
r2_svr_lr_test_rp = r2_score(y_test, sc_y.inverse_transform(y_pred.reshape(-1,1)))
mse_svr_lr_train_rp = mean_squared_error(y_train, y_train_pred)
mse_svr_lr_test_rp = mean_squared_error(y_test, sc_y.inverse_transform(y_pred.reshape(-1,1)))
print_scores(r2_svr_lr_train_rp, r2_svr_lr_test_rp, mse_svr_lr_train_rp, mse_svr_lr_test_rp, "Support Vector Regression")
ids = np.arange(1, len(y_test) + 1)
csv_file_path = "model_scores.csv"
with open(csv_file_path, mode='a', newline='') as file:
writer = csv.writer(file)
for i, saleprice in zip(ids, sc_y.inverse_transform(y_pred.reshape(-1,1))):
writer.writerow([i, saleprice[0]])
print(f"Scores saved to {csv_file_path}")
R2 Train Score using Support Vector Regression: 0.9439305022854688 R2 Test Score using Support Vector Regression : 0.9138355058165617 Mean Squared Error of Train using Support Vector Regression : 0.05606949771453119 Mean Squared Error of Test using Support Vector Regression : 294325726.69445837 Scores saved to model_scores.csv